howell <- read.table("http://hompal-stats.wabarr.com/datasets/Howell_craniometry.csv", header=TRUE, sep=",")
library(dplyr)
howell <- howell %>% dplyr::select(ID, Sex, Population, BNL, MDH, EKB, ZOR, BAA, NBA)
head(howell)
## ID Sex Population BNL MDH EKB ZOR BAA NBA
## 1 1 M NORSE 100 31 100 81 39 76
## 2 2 M NORSE 102 19 96 84 35 79
## 3 3 M NORSE 102 28 97 82 38 72
## 4 4 M NORSE 100 25 99 79 46 75
## 5 5 M NORSE 97 26 97 79 42 80
## 6 6 M NORSE 106 29 98 88 39 77
pairs(howell[,4:9])
cov(howell[,4:9])
## BNL MDH EKB ZOR BAA NBA
## BNL 33.7394473 8.8005539 14.2232895 22.304205 -2.2056656 -0.8760075
## MDH 8.8005539 14.9809962 6.3351161 5.002197 3.0393233 0.4381316
## EKB 14.2232895 6.3351161 17.9696276 12.515685 0.1239454 -1.8134923
## ZOR 22.3042053 5.0021967 12.5156847 24.287058 -1.6050285 -3.1503620
## BAA -2.2056656 3.0393233 0.1239454 -1.605029 10.2812444 0.6143717
## NBA -0.8760075 0.4381316 -1.8134923 -3.150362 0.6143717 11.2139906
#gives the same results
var(howell[,4:9])
## BNL MDH EKB ZOR BAA NBA
## BNL 33.7394473 8.8005539 14.2232895 22.304205 -2.2056656 -0.8760075
## MDH 8.8005539 14.9809962 6.3351161 5.002197 3.0393233 0.4381316
## EKB 14.2232895 6.3351161 17.9696276 12.515685 0.1239454 -1.8134923
## ZOR 22.3042053 5.0021967 12.5156847 24.287058 -1.6050285 -3.1503620
## BAA -2.2056656 3.0393233 0.1239454 -1.605029 10.2812444 0.6143717
## NBA -0.8760075 0.4381316 -1.8134923 -3.150362 0.6143717 11.2139906
distMat <- dist(x = howell[,4:9], method = "euclid")
HowellClust <- hclust(d = distMat, method = "average")
plot(HowellClust)
detach("package:dplyr", unload=TRUE)
library(MASS)
howellDFA <- lda(Population ~ BNL + MDH + EKB + ZOR + BAA + NBA, data=howell)
howellDFA
## Call:
## lda(Population ~ BNL + MDH + EKB + ZOR + BAA + NBA, data = howell)
##
## Prior probabilities of groups:
## AINU ANDAMAN ANYANG ARIKARA ATAYAL AUSTRALI
## 0.034072900 0.027733756 0.016640254 0.027337559 0.018621236 0.040015848
## BERG BURIAT BUSHMAN DOGON EASTER I EGYPT
## 0.043185420 0.043185420 0.035657686 0.039223455 0.034072900 0.043977813
## ESKIMO GUAM HAINAN MOKAPU MORIORI N JAPAN
## 0.042789223 0.022583201 0.032884311 0.039619651 0.042789223 0.034469097
## N MAORI NORSE PERU PHILLIPI S JAPAN S MAORI
## 0.003961965 0.043581616 0.043581616 0.019809826 0.036053883 0.003961965
## SANTA CR TASMANIA TEITA TOLAI ZALAVAR ZULU
## 0.040412044 0.034469097 0.032884311 0.043581616 0.038827258 0.040015848
##
## Group means:
## BNL MDH EKB ZOR BAA NBA
## AINU 103.65116 27.65116 100.36047 81.19767 37.59302 78.50000
## ANDAMAN 91.64286 24.17143 91.85714 72.65714 37.17143 80.15714
## ANYANG 101.28571 30.59524 98.78571 81.09524 40.76190 81.19048
## ARIKARA 100.75362 26.92754 97.91304 79.65217 41.37681 77.59420
## ATAYAL 96.38298 25.59574 95.19149 76.17021 38.91489 80.40426
## AUSTRALI 99.15842 27.37624 99.82178 81.00000 36.20792 74.76238
## BERG 95.83486 27.00000 96.99083 78.09174 40.94495 76.80734
## BURIAT 99.30275 27.74312 99.55963 84.33945 43.03670 75.69725
## BUSHMAN 93.07778 23.26667 95.32222 77.54444 35.65556 74.02222
## DOGON 96.58586 27.02020 97.12121 76.94949 37.75758 78.76768
## EASTER I 108.34884 27.59302 97.95349 85.61628 35.88372 79.31395
## EGYPT 98.81982 27.81982 94.02703 77.45045 40.13514 77.21622
## ESKIMO 103.31481 25.25000 98.41667 86.06481 39.65741 78.53704
## GUAM 102.63158 29.10526 99.57895 81.52632 39.57895 80.87719
## HAINAN 97.30120 27.54217 96.14458 78.45783 41.09639 81.27711
## MOKAPU 104.28000 28.29000 98.00000 82.71000 37.34000 79.65000
## MORIORI 103.59259 29.87037 98.82407 81.62037 40.87963 76.98148
## N JAPAN 99.33333 29.20690 97.25287 79.70115 41.41379 80.00000
## N MAORI 105.00000 32.80000 99.40000 83.20000 38.00000 77.80000
## NORSE 99.55455 27.78182 96.87273 80.03636 39.88182 75.45455
## PERU 93.29091 28.13636 93.11818 72.78182 41.55455 78.76364
## PHILLIPI 98.48000 29.62000 98.36000 79.46000 39.76000 79.86000
## S JAPAN 99.09890 27.89011 95.75824 79.56044 40.26374 80.80220
## S MAORI 105.60000 31.70000 100.20000 83.80000 39.70000 77.70000
## SANTA CR 95.04902 27.02941 97.01961 76.53922 40.63725 77.19608
## TASMANIA 97.34483 24.59770 98.51724 78.72414 35.37931 77.79310
## TEITA 98.72289 26.19277 97.55422 81.08434 37.16867 75.93976
## TOLAI 98.60000 27.45455 98.76364 81.85455 36.87273 79.70000
## ZALAVAR 99.09184 28.04082 96.56122 78.63265 39.78571 78.03061
## ZULU 99.85149 27.13861 99.45545 79.54455 38.14851 77.05941
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3 LD4 LD5
## BNL -0.146996168 -0.006578434 0.15602185 0.251366265 0.008445963
## MDH 0.001150378 -0.079098956 0.06847894 0.003796849 -0.089985789
## EKB 0.131146953 0.061956012 -0.05293233 -0.066234722 -0.271118512
## ZOR -0.123586303 0.160200307 -0.23079779 -0.201077715 0.119939453
## BAA -0.250462354 -0.270173233 -0.17932627 0.005994658 -0.002678732
## NBA -0.183717324 0.061478572 0.18034978 -0.263888987 -0.021081306
## LD6
## BNL 0.1230614
## MDH -0.2913560
## EKB 0.1378057
## ZOR -0.1514408
## BAA 0.1275435
## NBA 0.0360990
##
## Proportion of trace:
## LD1 LD2 LD3 LD4 LD5 LD6
## 0.3472 0.2690 0.1871 0.1097 0.0583 0.0287
predictions <- predict(howellDFA, newdata = howell[,4:9])
sum(predictions$class == howell$Population) / nrow(howell)
## [1] 0.3098257
library(ggplot2)
qplot(x=LD1, y=LD2, color = howell$Population, data=as.data.frame(predictions$x), size=I(5)) + theme_bw(30)
sum(predictions$class == howell$Population) / length(predictions$class)
## [1] 0.3098257
It depends on how you look at it. It the ~30% correct classification rate isn’t great, but we would only expect about 1/30 = 3.3333333% by chance, so it isn’t as bad as it seems.